The aim of this experiment is to evaluate the clickbait detector model and find out what kinds of clickbait it fails to detect.
In [10]:
from keras.models import load_model
from keras.preprocessing import sequence
import sys
import string
import re
# Sentinel tokens shared with the training vocabulary.
UNK = "<UNK>"
PAD = "<PAD>"
# Collapses runs of two-or-more spaces into one. A raw string with a plain
# space replaces the original "\ {2,}": "\ " is an invalid escape sequence
# (SyntaxWarning on modern Python) that only happened to mean a space.
MATCH_MULTIPLE_SPACES = re.compile(r" {2,}")
# Headlines are padded/truncated to this many tokens before prediction.
SEQUENCE_LENGTH = 20
In [11]:
# Load the trained detector and the vocabulary it indexes words with.
model = load_model("../models/detector.h5")
# Context manager closes the handle promptly; the original bare
# open(...).read() leaked the file object.
with open("../data/vocabulary.txt") as vocabulary_file:
    vocabulary = vocabulary_file.read().split("\n")
# word -> integer index, for encoding headlines into model input.
inverse_vocabulary = dict((word, i) for i, word in enumerate(vocabulary))
In [12]:
clickbait = open("../data/clickbait.valid.txt").read().split("\n")
genuine = open("../data/genuine.valid.txt").read().split("\n")
print "Clickbait: "
for each in clickbait[:5]:
print each
print "-" * 50
print "Genuine: "
for each in genuine[:5]:
print each
In [13]:
def words_to_indices(words):
    """Translate tokens to vocabulary indices; unknown tokens map to <UNK>."""
    result = []
    for token in words:
        result.append(inverse_vocabulary.get(token, inverse_vocabulary[UNK]))
    return result
def clean(text):
    """Space-pad every punctuation mark and digit, then collapse repeated spaces."""
    for mark in string.punctuation:
        text = text.replace(mark, " %s " % mark)
    for digit in "0123456789":
        text = text.replace(digit, " %s " % digit)
    # Equivalent to the module-level MATCH_MULTIPLE_SPACES pattern.
    return re.sub(" {2,}", " ", text)
In [14]:
# Measure false positives: run every genuine headline through the detector
# and print the ones it mislabels as clickbait (score > 0.5).
wrong_genuine_count = 0
for each in genuine:
    # Drop non-ASCII characters, lowercase, space-pad punctuation/digits,
    # then tokenize on whitespace.
    cleaned = clean(each.encode("ascii", "ignore").lower()).split()
    indices = words_to_indices(cleaned)
    # Pad/truncate to the fixed-length window the network expects.
    indices = sequence.pad_sequences([indices], maxlen=SEQUENCE_LENGTH)
    prediction = model.predict(indices)[0, 0]
    if prediction > .5:
        # False positive: a genuine headline scored as clickbait.
        print prediction, each
        wrong_genuine_count += 1
print "-" * 50
print "{0} out of {1} wrong.".format(wrong_genuine_count, len(genuine))
In [15]:
# Measure false negatives: run every clickbait headline through the detector
# and print the ones it mislabels as genuine (score < 0.5).
wrong_clickbait_count = 0
for each in clickbait:
    # Drop non-ASCII characters, lowercase, space-pad punctuation/digits,
    # then tokenize on whitespace.
    cleaned = clean(each.encode("ascii", "ignore").lower()).split()
    indices = words_to_indices(cleaned)
    # Pad/truncate to the fixed-length window the network expects.
    indices = sequence.pad_sequences([indices], maxlen=SEQUENCE_LENGTH)
    prediction = model.predict(indices)[0, 0]
    if prediction < .5:
        # False negative: a clickbait headline scored as genuine.
        print prediction, each
        wrong_clickbait_count += 1
print "-" * 50
print "{0} out of {1} wrong.".format(wrong_clickbait_count, len(clickbait))